In [1]:
import numpy as np
import pandas as pd

In [2]:
%%writefile 1.txt
This is a story about cats
our feline pets
Cats are furry animals


Overwriting 1.txt

In [3]:
%%writefile 2.txt
This story is about surfing
Catching waves is fun
Surfing is a popular water sport


Overwriting 2.txt

Building the vocab


In [6]:
# Build the vocabulary from 1.txt: each distinct lower-cased,
# whitespace-separated token gets the next free integer id, starting at 1,
# in order of first appearance.
vocab = {}
i = 1

with open('1.txt') as f:
    for word in f.read().lower().split():
        if word not in vocab:
            vocab[word] = i
            i += 1

print(vocab)


{'this': 1, 'is': 2, 'a': 3, 'story': 4, 'about': 5, 'cats': 6, 'our': 7, 'feline': 8, 'pets': 9, 'are': 10, 'furry': 11, 'animals': 12}

In [7]:
# Extend the existing vocabulary with the tokens of 2.txt.
# Words already present keep their id; new words continue the running counter i.
with open('2.txt') as f:
    tokens = f.read().lower().split()

for word in tokens:
    if word in vocab:
        continue
    vocab[word] = i
    i += 1

print(vocab)


{'this': 1, 'is': 2, 'a': 3, 'story': 4, 'about': 5, 'cats': 6, 'our': 7, 'feline': 8, 'pets': 9, 'are': 10, 'furry': 11, 'animals': 12, 'surfing': 13, 'catching': 14, 'waves': 15, 'fun': 16, 'popular': 17, 'water': 18, 'sport': 19}

Feature Extraction


In [11]:
# One count vector per document: slot 0 carries the file name, followed by
# len(vocab) zeroed slots (vocabulary ids start at 1, so they index directly).
file_one, file_two = (
    [name] + [0] * len(vocab) for name in ('1.txt', '2.txt')
)

print(file_one, file_two, sep='\n')


['1.txt', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['2.txt', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [15]:
# Count how often each vocabulary word occurs in 1.txt.
# The vector is rebuilt from zeros here instead of being incremented in place
# on the list created in an earlier cell, so re-running this cell cannot
# double-count. Slot 0 holds the file name; slot vocab[word] holds that
# word's count.
file_one = ['1.txt'] + [0] * len(vocab)

with open('1.txt') as f:
    for word in f.read().lower().split():
        file_one[vocab[word]] += 1

file_one


Out[15]:
['1.txt', 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]

In [16]:
# Count how often each vocabulary word occurs in 2.txt.
# The vector is rebuilt from zeros here instead of being incremented in place
# on the list created in an earlier cell, so re-running this cell cannot
# double-count. Slot 0 holds the file name; slot vocab[word] holds that
# word's count.
file_two = ['2.txt'] + [0] * len(vocab)

with open('2.txt') as f:
    for word in f.read().lower().split():
        file_two[vocab[word]] += 1

file_two


Out[16]:
['2.txt', 1, 3, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1]

In [17]:
# Show both count vectors, one per line, for side-by-side comparison.
print('\n'.join(str(vec) for vec in (file_one, file_two)))


['1.txt', 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
['2.txt', 1, 3, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1]

In [ ]: